==================================================================================================================
# Load the necessary libraries for data manipulation, visualization, and interactive visualization
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
==================================================================================================================
# Attach readxl, import the "DATA" worksheet of "data.xlsx" into df,
# and inspect the column types of the result
library(readxl)
df <- read_excel(path = "data.xlsx", sheet = "DATA")
str(df)
## tibble [12,093 × 17] (S3: tbl_df/tbl/data.frame)
## $ rec : num [1:12093] 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num [1:12093] 1911 1911 1911 1911 1911 ...
## $ session_no : num [1:12093] 1 2 2 2 2 2 2 2 2 2 ...
## $ paper_no : num [1:12093] 1 1 2 NA NA NA NA NA NA NA ...
## $ session_title : chr [1:12093] "Annual Address of the President of the American Economic Association" "Money and Prices" "Money and Prices" "Money and Prices" ...
## $ paper_title : chr [1:12093] "The Economic Significance of a Comprehensive System of National Education" "Causes of the Changes in Prices Since 1896" "Recent Changes in Price Levels and Their Causes" "Discussion" ...
## $ authors : chr [1:12093] "Edmund J. James" "J. Laurence Laughlin" "Irving Fisher" "D. F. Houston" ...
## $ name : chr [1:12093] "Edmund J. James" "J. Laurence Laughlin" "Irving Fisher" "D. F. Houston" ...
## $ role : num [1:12093] 4 1 1 2 2 2 2 2 2 2 ...
## $ num_auth : num [1:12093] 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : num [1:12093] 1 1 1 1 1 1 1 1 1 1 ...
## $ JEL : chr [1:12093] "I1" "E" "E" "E" ...
## $ field : num [1:12093] 8 4 4 4 4 4 4 4 4 4 ...
## $ CSWEP : num [1:12093] 0 0 0 0 0 0 0 0 0 0 ...
## $ prim_institution: chr [1:12093] "University of Illinois" "University of Chicago" "Yale University" "Washington University in St. Louis" ...
## $ inst_type : num [1:12093] 2 1 1 2 1 1 1 1 1 1 ...
## $ full_institution: chr [1:12093] "University of Illinois" "University of Chicago" "Yale University" "Washington University in St. Louis" ...
==================================================================================================================
# Discard the columns the analysis does not use
df <- select(
  df,
  -c(
    session_no, paper_no, session_title, paper_title, authors, name,
    num_auth, JEL, CSWEP, prim_institution, inst_type, full_institution
  )
)
# Show the trimmed data frame
print(df)
## # A tibble: 12,093 × 5
## rec year role gender field
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1911 4 1 8
## 2 2 1911 1 1 4
## 3 3 1911 1 1 4
## 4 4 1911 2 1 4
## 5 5 1911 2 1 4
## 6 6 1911 2 1 4
## 7 7 1911 2 1 4
## 8 8 1911 2 1 4
## 9 9 1911 2 1 4
## 10 10 1911 2 1 4
## # ℹ 12,083 more rows
# Inspect the structure (column names and types) of the reduced data frame
str(df)
## tibble [12,093 × 5] (S3: tbl_df/tbl/data.frame)
## $ rec : num [1:12093] 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num [1:12093] 1911 1911 1911 1911 1911 ...
## $ role : num [1:12093] 4 1 1 2 2 2 2 2 2 2 ...
## $ gender: num [1:12093] 1 1 1 1 1 1 1 1 1 1 ...
## $ field : num [1:12093] 8 4 4 4 4 4 4 4 4 4 ...
# Print the number of missing values (NA) in each remaining column, one
# count per column in the order rec, year, role, gender, field
for (column in c("rec", "year", "role", "gender", "field")) {
  print(sum(is.na(df[[column]])))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
# Count the rows that exactly repeat an earlier row
jumlah_duplikat <- df %>%
  duplicated() %>%
  sum()
cat("Number of duplicate rows:", jumlah_duplikat, "\n")
## Number of duplicate rows: 0
# Print the distinct codes found in the gender, role and field columns to
# see which values each coded variable takes
for (column in c("gender", "role", "field")) {
  print(unique(df[[column]]))
}
## [1] 1 2 3
## [1] 4 1 2 5 3
## [1] 8 4 1 9 10 7 6 5 11 3 2
# Drop rows whose gender code is 3 (treated as invalid), encode gender as
# two 0/1 indicator columns (male for code 1, female for code 2), and then
# remove the original gender column, which is no longer needed
df <- df %>%
  filter(gender != 3) %>%
  mutate(
    male = as.numeric(gender == 1),
    female = as.numeric(gender == 2)
  ) %>%
  select(-gender)
str(df)
## tibble [12,092 × 6] (S3: tbl_df/tbl/data.frame)
## $ rec : num [1:12092] 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num [1:12092] 1911 1911 1911 1911 1911 ...
## $ role : num [1:12092] 4 1 1 2 2 2 2 2 2 2 ...
## $ field : num [1:12092] 8 4 4 4 4 4 4 4 4 4 ...
## $ male : num [1:12092] 1 1 1 1 1 1 1 1 1 1 ...
## $ female: num [1:12092] 0 0 0 0 0 0 0 0 0 0 ...
# Replace the numeric role codes 1-5 with descriptive labels by position
# lookup; any value that is not exactly one of 1-5 (including NA) becomes
# "unknown"
df <- df %>%
  mutate(
    role = coalesce(
      c(
        "Paper Author", "Discussant", "Roundtable Participant",
        "Address", "Roundtable Chair"
      )[match(role, 1:5)],
      "unknown"
    )
  )
print(df)
## # A tibble: 12,092 × 6
## rec year role field male female
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1 1911 Address 8 1 0
## 2 2 1911 Paper Author 4 1 0
## 3 3 1911 Paper Author 4 1 0
## 4 4 1911 Discussant 4 1 0
## 5 5 1911 Discussant 4 1 0
## 6 6 1911 Discussant 4 1 0
## 7 7 1911 Discussant 4 1 0
## 8 8 1911 Discussant 4 1 0
## 9 9 1911 Discussant 4 1 0
## 10 10 1911 Discussant 4 1 0
## # ℹ 12,082 more rows
# Replace the numeric field codes 1-11 with descriptive labels; any other
# value (including NA) falls through to "unknown".
# The label for code 1 is spelled "Economics Profession" (it was previously
# misspelled), because these labels are shown directly on the chart axes.
df <- df %>%
  mutate(
    field = case_when(
      field == 1 ~ "Economics Profession",
      field == 2 ~ "Econometrics",
      field == 3 ~ "Microeconomics",
      field == 4 ~ "Macroeconomics",
      field == 5 ~ "International",
      field == 6 ~ "Public",
      field == 7 ~ "Labor",
      field == 8 ~ "Health & Education",
      field == 9 ~ "Industrial Organization",
      field == 10 ~ "Environment",
      field == 11 ~ "Others",
      TRUE ~ "unknown"
    )
  )
print(df)
## # A tibble: 12,092 × 6
## rec year role field male female
## <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 1911 Address Health & Education 1 0
## 2 2 1911 Paper Author Macroeconomics 1 0
## 3 3 1911 Paper Author Macroeconomics 1 0
## 4 4 1911 Discussant Macroeconomics 1 0
## 5 5 1911 Discussant Macroeconomics 1 0
## 6 6 1911 Discussant Macroeconomics 1 0
## 7 7 1911 Discussant Macroeconomics 1 0
## 8 8 1911 Discussant Macroeconomics 1 0
## 9 9 1911 Discussant Macroeconomics 1 0
## 10 10 1911 Discussant Macroeconomics 1 0
## # ℹ 12,082 more rows
# Sanity-check the cleaned data: column names first, then structure
print(names(df))
## [1] "rec" "year" "role" "field" "male" "female"
str(df)
## tibble [12,092 × 6] (S3: tbl_df/tbl/data.frame)
## $ rec : num [1:12092] 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num [1:12092] 1911 1911 1911 1911 1911 ...
## $ role : chr [1:12092] "Address" "Paper Author" "Paper Author" "Discussant" ...
## $ field : chr [1:12092] "Health & Education" "Macroeconomics" "Macroeconomics" "Macroeconomics" ...
## $ male : num [1:12092] 1 1 1 1 1 1 1 1 1 1 ...
## $ female: num [1:12092] 0 0 0 0 0 0 0 0 0 0 ...
==================================================================================================================
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Summary statistics for the numeric columns only
numeric_df <- select(df, where(is.numeric))
describe(numeric_df)
## vars n mean sd median trimmed mad min max range skew
## rec 1 12092 6046.81 3491.17 6046.5 6046.76 4482.64 1 12093 12092 0.00
## year 2 12092 1982.47 30.35 1990.0 1985.28 34.10 1911 2020 109 -0.60
## male 3 12092 0.86 0.34 1.0 0.96 0.00 0 1 1 -2.13
## female 4 12092 0.14 0.34 0.0 0.04 0.00 0 1 1 2.13
## kurtosis se
## rec -1.20 31.75
## year -0.77 0.28
## male 2.52 0.00
## female 2.52 0.00
Descriptive statistics for numeric variables are useful for understanding the data by providing information on the count, mean, median, minimum, and maximum values. Additionally, they help determine whether the data is symmetric or skewed.
==================================================================================================================
# Data for the interactive bar chart of male and female contributions to
# research presented at the AEA annual meeting, broken down by role.
# For each role: the male and female totals, their sum, and each gender's
# share of that sum formatted as a percentage string with one decimal.
df_summary <- df %>%
  group_by(role) %>%
  summarize(across(c(male, female), sum)) %>%
  mutate(total = male + female) %>%
  mutate(
    male_percentage = scales::percent(male / total, accuracy = 0.1),
    female_percentage = scales::percent(female / total, accuracy = 0.1)
  )
# Grouped bar chart: number of contributions per role, one bar per gender,
# with each gender's percentage share shown on hover.
# The trace type is given on each add_trace() call instead of in plot_ly():
# a type passed to plot_ly() makes it emit its own, empty bar trace, and
# that data-less trace next to the categorical role axis is what raised
# "Can't display both discrete & non-discrete data on same axis".
plot_ly(data = df_summary) %>%
  add_trace(
    x = ~role, y = ~male, type = 'bar', name = 'Male',
    hoverinfo = 'text', text = ~paste("Male: ", male_percentage),
    showlegend = TRUE
  ) %>%
  add_trace(
    x = ~role, y = ~female, type = 'bar', name = 'Female',
    hoverinfo = 'text', text = ~paste("Female: ", female_percentage),
    showlegend = TRUE
  ) %>%
  layout(
    title = "Number of Contributions by Roles and Gender",
    xaxis = list(title = "Roles"),
    yaxis = list(title = "Number of Contributions"),
    barmode = 'group'
  )
The chart shows that female researchers are best represented among paper authors, at 16%, which is still low compared with the male share. This is nevertheless higher than in the other roles, all of which have a female share below 10%, with the lowest at 2.5%. It also suggests that papers with a female contributor mostly involve women as authors, meaning that they have to work independently and are less likely to contribute in any role other than author.
==================================================================================================================
# Data for the interactive line chart of male and female contributions to
# research presented at the AEA annual meeting, broken down by role.
# For each role: the male and female totals, their sum, and each gender's
# share of that sum formatted as a percentage string with one decimal.
df_summary <- df %>%
  group_by(role) %>%
  summarize(across(c(male, female), sum)) %>%
  mutate(total = male + female) %>%
  mutate(
    male_percentage = scales::percent(male / total, accuracy = 0.1),
    female_percentage = scales::percent(female / total, accuracy = 0.1)
  )
# Line-and-marker chart of contribution counts per role, one series per
# gender; hovering shows the role and that gender's percentage share
plot_ly(
  data = df_summary,
  marker = list(size = 10)
) %>%
  add_trace(
    type = 'scatter',
    mode = 'lines+markers',
    name = 'Male',
    x = ~role,
    y = ~male,
    hoverinfo = 'text',
    text = ~paste("Role : ", role, "<br>",
                  "Male Percentage : ", male_percentage)
  ) %>%
  add_trace(
    type = 'scatter',
    mode = 'lines+markers',
    name = 'Female',
    x = ~role,
    y = ~female,
    hoverinfo = 'text',
    text = ~paste("Role : ", role, "<br>",
                  "Female Percentage : ", female_percentage)
  ) %>%
  layout(
    title = "Number and Percentage of Contributions by Role and Gender",
    xaxis = list(title = "Role"),
    yaxis = list(title = "Number of Contributions")
  )
The chart shows that papers with a female contributor mostly involve women as authors, which means that they have to work independently and are less likely to contribute in any role other than author. This may suggest that male authors are more likely to work with other male contributors than with female contributors.
==================================================================================================================
# Data for the interactive bar chart of male and female contributions to
# research presented at the AEA annual meeting, broken down by field of
# economics.
# For each field: the male and female totals, their sum, and each gender's
# share of that sum formatted as a percentage string with one decimal.
df_summary <- df %>%
  group_by(field) %>%
  summarize(across(c(male, female), sum)) %>%
  mutate(total = male + female) %>%
  mutate(
    male_percentage = scales::percent(male / total, accuracy = 0.1),
    female_percentage = scales::percent(female / total, accuracy = 0.1)
  )
# Grouped bar chart: number of contributions per field, one bar per gender,
# with each gender's percentage share shown on hover.
# The trace type is given on each add_trace() call instead of in plot_ly():
# a type passed to plot_ly() makes it emit its own, empty bar trace, and
# that data-less trace next to the categorical field axis is what raised
# "Can't display both discrete & non-discrete data on same axis".
plot_ly(data = df_summary) %>%
  add_trace(
    x = ~field, y = ~male, type = 'bar', name = 'Male',
    hoverinfo = 'text', text = ~paste("Male: ", male_percentage),
    showlegend = TRUE
  ) %>%
  add_trace(
    x = ~field, y = ~female, type = 'bar', name = 'Female',
    hoverinfo = 'text', text = ~paste("Female: ", female_percentage),
    showlegend = TRUE
  ) %>%
  layout(
    title = "Number of Contributions by Field and Gender",
    xaxis = list(title = "Field"),
    yaxis = list(title = "Number of Contributions"),
    barmode = 'group'
  )
From this chart, it is visible that all fields of economics are predominantly male. The fields in which females contribute the most in percentage terms are Labor and Health & Education, standing at 32%.
==================================================================================================================
# Data for the two interactive line charts that show how male and female
# contributions in each role grew over time: one row per (year, role) with
# the number of male, respectively female, contributions.
# .groups = 'drop' returns an ungrouped tibble directly, so no trailing
# ungroup() is needed and summarise() no longer prints its
# "has grouped output by 'year'" message.
df_summary_year_malee <- df %>%
  group_by(year, role) %>%
  summarize(male = sum(male), .groups = 'drop')
df_summary_year_femalee <- df %>%
  group_by(year, role) %>%
  summarize(female = sum(female), .groups = 'drop')
# Line-and-marker chart of male contributions per year, one series per role.
# type and mode are set on add_trace() rather than in plot_ly(): a type
# passed to plot_ly() makes it emit an additional trace of its own, which
# here carries no x/y data and only adds a spurious legend entry.
plot_ly(data = df_summary_year_malee) %>%
  add_trace(
    x = ~year, y = ~male, color = ~role,
    type = 'scatter', mode = 'lines+markers',
    text = ~paste("Role: ", role, "<br>", "Year: ", year, "<br>", "Male: ", male),
    hoverinfo = 'text'
  ) %>%
  layout(
    title = "Growth of Male Roles Per Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Number of Male Contributions")
  )
# Line-and-marker chart of female contributions per year, one series per
# role. type and mode are set on add_trace() rather than in plot_ly(): a
# type passed to plot_ly() makes it emit an additional trace of its own,
# which here carries no x/y data and only adds a spurious legend entry.
plot_ly(data = df_summary_year_femalee) %>%
  add_trace(
    x = ~year, y = ~female, color = ~role,
    type = 'scatter', mode = 'lines+markers',
    text = ~paste("Role: ", role, "<br>", "Year: ", year, "<br>", "Female: ", female),
    hoverinfo = 'text'
  ) %>%
  layout(
    title = "Growth of Female Roles Per Year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Number of Female Contributions")
  )
Consistent with the results of the first bar chart, the number of contributions in the “Paper Author” role tends to grow year by year for both men and women. This clearly makes “Paper Author” the most frequent role to date. Meanwhile, contributions in the “Roundtable Chair” and “Roundtable Participant” roles stopped before 1970, and so far there is no data indicating that these roles have increased since.
==================================================================================================================
# Data for the interactive line chart of each role's share of contributions
# per year: total contributions (male + female) per (year, role), divided
# by the sum of those totals within the same year
df_summary_role_year_prop <- df %>%
  group_by(year, role) %>%
  summarize(across(c(male, female), sum), .groups = 'drop') %>%
  mutate(
    total = male + female,
    prop = total / ave(total, year, FUN = sum)
  )
# Filled line-and-marker chart of the yearly contribution share, one series
# per role, with the y axis formatted as a percentage
plot_ly(
  df_summary_role_year_prop,
  type = 'scatter',
  mode = 'lines+markers',
  fill = 'tozeroy',
  x = ~year,
  y = ~prop,
  color = ~role,
  text = ~paste("Year: ", year, "<br>Role: ", role, "<br>Proportion: ", scales::percent(prop, accuracy = 0.1))
) %>%
  layout(
    title = "Proportion Contribution per Year by Role",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Contribution Proportion", tickformat = "%"),
    hovermode = "closest"
  )
From the visualization above, we can see that the “Discussant” role had the highest contribution at the beginning of the 20th century. However, over time, the “Paper Author” role has become the most dominant, with contributions exceeding 90% in the last 40 years. Although the “Roundtable Participant” role shows zero contributions this year, in the early 20th century, specifically from 1926 to 1934, it had more contributions than the “Paper Author” role. This clearly indicates that the proportion of contributions by each role tended to fluctuate in the early 20th century and has been dominated by the “Paper Author” role in the last 40 years.
==================================================================================================================
# Data for the interactive chart of male and female contributions to
# research presented at the AEA annual meeting per year, regardless of
# role: one row per year with the male and female totals
df_sum <- df %>%
  group_by(year) %>%
  summarize(across(c(male, female), sum))
# Scatter plot of yearly contribution counts, one marker series per gender;
# hovering shows the year and that gender's count
plot_ly(
  data = df_sum,
  mode = 'markers',
  marker = list(size = 10)
) %>%
  add_trace(
    type = 'scatter',
    mode = 'markers',
    name = 'Male',
    x = ~year,
    y = ~male,
    hoverinfo = 'text',
    text = ~paste("Year: ", year, "<br>",
                  "Male Contributions: ", male)
  ) %>%
  add_trace(
    type = 'scatter',
    mode = 'markers',
    name = 'Female',
    x = ~year,
    y = ~female,
    hoverinfo = 'text',
    text = ~paste("Year: ", year, "<br>",
                  "Female Contributions: ", female)
  ) %>%
  layout(
    title = "Number of Contributions by Year and Gender",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Number of Contributions"),
    hovermode = 'closest'
  )
From the chart it is clearly visible that there is a huge gap between males and females, with its peak at around 1950 to 1970. The early years from 1910 to 1940 show a small gap between males and females, but this doesn’t necessarily mean that they were equal; it may be due to the low number of economic research papers that were published or presented in those years. There was, however, an event in 1920 that changed and boosted women’s contribution: the change that allowed women to vote. Its effect is visible in the data, as before 1920 there are only 4 recorded female contributions, while after 1920 there are 8 contributions within 10 years.
==================================================================================================================
# Data for two interactive pie charts comparing the gender split of
# contributions in 1980 and in 2020. For each year the male and female
# totals are computed first and then reshaped into a two-row data frame
# (label, value) for the pie chart.

# Year 2020: totals, then pie-chart data frame
df_2020 <- summarize(
  filter(df, year == 2020),
  male_total = sum(male),
  female_total = sum(female)
)
df_pie_2020 <- data.frame(
  gender = c("Male", "Female"),
  total = c(df_2020[["male_total"]], df_2020[["female_total"]])
)

# Year 1980: totals, then pie-chart data frame
df_1980 <- summarize(
  filter(df, year == 1980),
  male_total = sum(male),
  female_total = sum(female)
)
df_pie_1980 <- data.frame(
  gender = c("Male", "Female"),
  total = c(df_1980[["male_total"]], df_1980[["female_total"]])
)
# Interactive pie chart of the 2020 gender split (label and percentage
# shown on each slice)
plot_ly(
  df_pie_2020,
  type = 'pie',
  labels = ~gender,
  values = ~total,
  textinfo = 'label+percent',
  insidetextorientation = 'radial'
) %>%
  layout(title = "Contribution of Male and Female in 2020")
# Same chart for 1980
plot_ly(
  df_pie_1980,
  type = 'pie',
  labels = ~gender,
  values = ~total,
  textinfo = 'label+percent',
  insidetextorientation = 'radial'
) %>%
  layout(title = "Contribution of Male and Female in 1980")
The chart shows that in 1980, females accounted for only 9.57% of contributions, whereas in 2020 they accounted for 36.1%. Although females still do not dominate the contributions, there has been a significant increase in female contributions over the past 40 years.
==================================================================================================================
==================================================================================================================